<!DOCTYPE html>
<html class="has-navbar-fixed-top" lang="zh-CN">
<head>
    <meta charset="utf-8">
<title>新浪微博移动端签到数据爬取 - wanzixin</title>
<meta name="viewport" content="width=device-width, initial-scale=1">

<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/outdated-browser/1.1.5/outdatedbrowser.min.css">


<link href="/zh-cn/Item/%E6%96%B0%E6%B5%AA%E5%BE%AE%E5%8D%9A%E7%A7%BB%E5%8A%A8%E7%AB%AF%E7%AD%BE%E5%88%B0%E6%95%B0%E6%8D%AE%E7%88%AC%E5%8F%96/" rel="alternate" hreflang="zh-CN" />
    


    <meta name="description" content="爬取网站为新浪微博移动端，相对于PC端而言网页结构简单且限制较少，签到页不需要模拟登录。以城市为单位爬取新浪微博移动端POI下的签到微博，存入csv文件。本项目开源在GitHub，地址指路。 更新，西刺代理无法再使用，请更换其他代理。">
<meta property="og:type" content="article">
<meta property="og:title" content="新浪微博移动端签到数据爬取">
<meta property="og:url" content="https://wanzixin.github.io/Item/%E6%96%B0%E6%B5%AA%E5%BE%AE%E5%8D%9A%E7%A7%BB%E5%8A%A8%E7%AB%AF%E7%AD%BE%E5%88%B0%E6%95%B0%E6%8D%AE%E7%88%AC%E5%8F%96/index.html">
<meta property="og:site_name" content="wanzixin">
<meta property="og:description" content="爬取网站为新浪微博移动端，相对于PC端而言网页结构简单且限制较少，签到页不需要模拟登录。以城市为单位爬取新浪微博移动端POI下的签到微博，存入csv文件。本项目开源在GitHub，地址指路。 更新，西刺代理无法再使用，请更换其他代理。">
<meta property="og:locale" content="en_US">
<meta property="og:image" content="https://img.shields.io/badge/python-3.7-blue.svg">
<meta property="article:published_time" content="2020-05-25T10:01:51.000Z">
<meta property="article:modified_time" content="2021-07-18T02:39:18.761Z">
<meta property="article:author" content="wanzixin">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://img.shields.io/badge/python-3.7-blue.svg">





<link rel="icon" href="/favicon.png">


<link rel="stylesheet" href="//fonts.googleapis.com/css?family=Ovo|Source+Code+Pro">
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/bulma/0.6.2/css/bulma.min.css">


<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/lightgallery/1.6.8/css/lightgallery.min.css">
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/justifiedGallery/3.6.5/css/justifiedGallery.min.css">


<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/atom-one-light.min.css">


<link rel="stylesheet" href="/css/style.css">


<script defer src="//use.fontawesome.com/releases/v5.0.8/js/all.js"></script>


    
    
    
    
    
    
    
    
    
    

    


<meta name="generator" content="Hexo 5.4.0"></head>
<body>
    
<nav class="navbar is-transparent is-fixed-top navbar-main" role="navigation" aria-label="main navigation">
    <div class="container">
        <div class="navbar-brand">
            <a class="navbar-item navbar-logo" href="/">
                
                    
                    wanzixin
                    
                
            </a>
            <div class="navbar-burger">
                <span></span>
                <span></span>
                <span></span>
            </div>
        </div>
        
        <div class="navbar-menu navbar-start">
            
            <a class="navbar-item "
               href="/archives">Archives</a>
            
            <a class="navbar-item "
               href="/categories">Categories</a>
            
            <a class="navbar-item "
               href="/categories/Diary">Diary</a>
            
            <a class="navbar-item "
               href="/categories/Gallery">Gallery</a>
            
            <a class="navbar-item "
               href="/categories/Study">Study</a>
            
            <a class="navbar-item "
               href="/categories/Item">Item</a>
            
            <a class="navbar-item "
               href="/about">About</a>
            
        </div>
        
        <div class="navbar-menu navbar-end">
            
            <a class="navbar-item search" title="Search" href="javascript:;">
                <i class="fas fa-search"></i>
            </a>
            
            
            <div class="navbar-item is-hoverable has-dropdown is-hidden-mobile is-hidden-tablet-only toc">
                <a class="navbar-item" title="Table of Contents">
                    <i class="fa fa-list"></i>
                </a>
                <div class="navbar-dropdown is-right">
                    
                    
                    
                    
                    <a class="navbar-item" href="#文件说明">1&nbsp;&nbsp;<b>文件说明</b></a>
                    
                    
                    <hr class="navbar-divider">
                    
                    
                    <a class="navbar-item" href="#爬取信息">2&nbsp;&nbsp;<b>爬取信息</b></a>
                    
                    
                    <hr class="navbar-divider">
                    
                    
                    <a class="navbar-item" href="#思路">3&nbsp;&nbsp;<b>思路</b></a>
                    
                    
                    <hr class="navbar-divider">
                    
                    
                    <a class="navbar-item" href="#使用方法">4&nbsp;&nbsp;<b>使用方法</b></a>
                    
                    
                    <hr class="navbar-divider">
                    
                    
                    <a class="navbar-item" href="#依赖的第三方库">5&nbsp;&nbsp;<b>依赖的第三方库</b></a>
                    
                </div>
            </div>
            
            
            <a class="navbar-item" title="GitHub" target="_blank" rel="noopener" href="https://github.com/wanzixin">
                
                <i class="fab fa-github"></i>
                
            </a>
               
            
        </div>
    </div>
</nav>

    <section class="section">
    <div class="container">
    <article class="article content gallery" itemscope itemprop="blogPost">
    <h1 class="article-title is-size-3 is-size-4-mobile" itemprop="name">
        
            新浪微博移动端签到数据爬取
        
    </h1>
    <div class="article-meta columns is-variable is-1 is-multiline is-mobile is-size-7-mobile">
        <span class="column is-narrow">
            
                <span>May 25 2020</span>
            
        </span>
        
        <span class="column is-narrow article-category">
            <i class="far fa-folder"></i>
            <a class="article-category-link" href="/categories/Item/">Item</a>
        </span>
        
        
        <span class="column is-narrow">
            
            
            5 minutes read (About 689 words)
        </span>
        
    </div>
    <div class="article-entry is-size-6-mobile" itemprop="articleBody">
    
        <p><img src="https://img.shields.io/badge/python-3.7-blue.svg" alt="Python 3.7"></p>
<p>爬取网站为<a target="_blank" rel="noopener" href="https://m.weibo.cn/">新浪微博移动端</a>，相对于PC端而言网页结构简单且限制较少，签到页不需要模拟登录。以城市为单位爬取新浪微博移动端POI下的签到微博，存入csv文件。本项目开源在GitHub，地址<a target="_blank" rel="noopener" href="https://github.com/WanZixin/SinaWeibo-LocationSignIn-spider">指路</a>。</p>
<blockquote class="colorquote danger"><p>更新，西刺代理无法再使用，请更换其他代理。</p>
</blockquote>

<span id="more"></span>

<h2 id="文件说明"><a href="#文件说明" class="headerlink" title="文件说明"></a>文件说明</h2><p>buildip.py，爬取 <a target="_blank" rel="noopener" href="https://www.xicidaili.com/nn/">西刺高匿代理</a> 构建代理池。<br>myemail.py，爬取完毕后发邮件给自己的邮箱。<br>wifi.py，确保网络连接不断开（网络断开后自动重连）。<br>crawler.py，爬虫本体。<br>config.ini，配置文件，配置项有邮箱，wifi名称，城市名称，城市编码。</p>
<h2 id="爬取信息"><a href="#爬取信息" class="headerlink" title="爬取信息"></a>爬取信息</h2><table>
<thead>
<tr>
<th align="center">字段</th>
<th align="center">含义</th>
</tr>
</thead>
<tbody><tr>
<td align="center">user_id</td>
<td align="center">用户ID</td>
</tr>
<tr>
<td align="center">user_name</td>
<td align="center">昵称</td>
</tr>
<tr>
<td align="center">gender</td>
<td align="center">性别</td>
</tr>
<tr>
<td align="center">tweets</td>
<td align="center">微博文本</td>
</tr>
<tr>
<td align="center">textLength</td>
<td align="center">微博文本长度</td>
</tr>
<tr>
<td align="center">created_at</td>
<td align="center">发布时间</td>
</tr>
<tr>
<td align="center">source</td>
<td align="center">发布端</td>
</tr>
<tr>
<td align="center">followers_count</td>
<td align="center">粉丝数</td>
</tr>
<tr>
<td align="center">follow_count</td>
<td align="center">关注数</td>
</tr>
<tr>
<td align="center">statuses_count</td>
<td align="center">历史微博数</td>
</tr>
<tr>
<td align="center">profile_url</td>
<td align="center">主页链接</td>
</tr>
<tr>
<td align="center">pic_num</td>
<td align="center">图片数</td>
</tr>
<tr>
<td align="center">pics_url</td>
<td align="center">图片链接</td>
</tr>
<tr>
<td align="center">reposts_count</td>
<td align="center">转发数</td>
</tr>
<tr>
<td align="center">comments_count</td>
<td align="center">评论数</td>
</tr>
<tr>
<td align="center">attitudes_count</td>
<td align="center">态度数</td>
</tr>
</tbody></table>
<h2 id="思路"><a href="#思路" class="headerlink" title="思路"></a>思路</h2><ol>
<li>爬取城市页面，例如武汉市的URL为<a target="_blank" rel="noopener" href="https://m.weibo.cn/p/1001018008642010000000000">https://m.weibo.cn/p/1001018008642010000000000</a>，获取城市下的所有POI，写入&lt;cityName&gt;.csv文件。其实这里POI的名称和id可以直接传到下一步中，而这里生成csv文件是为了本地存储POI的信息供后一步使用。</li>
<li>读取上一步生成的csv文件读出POI的name和id，再构造URL爬取POI下的微博信息，例如黄鹤楼URL是<a target="_blank" rel="noopener" href="https://m.weibo.cn/p/index?containerid=100101B2094655D464A3FF439C">https://m.weibo.cn/p/index?containerid=100101B2094655D464A3FF439C</a>。</li>
</ol>
<h2 id="使用方法"><a href="#使用方法" class="headerlink" title="使用方法"></a>使用方法</h2><p>修改config.ini文件，email_address填写自己的邮箱，wifi填写已连接过的wifi名称，cityName填写爬取的城市名称，cityId填写城市编码。</p>
<p>城市编码参考新浪微博开放平台的<a target="_blank" rel="noopener" href="https://open.weibo.com/wiki/%E7%9C%81%E4%BB%BD%E5%9F%8E%E5%B8%82%E7%BC%96%E7%A0%81%E8%A1%A8">省份城市编码表</a>，举例如下：湖北省的省份编码为42，武汉市编码为1，则武汉市的编码为4201。值得注意的一点是：北京、上海、天津、重庆四个直辖市的编码后两位均为0，不再继续向下区分，北京市：1100，例如北京海淀区对应代码为1108，爬取不到内容。</p>
<h2 id="依赖的第三方库"><a href="#依赖的第三方库" class="headerlink" title="依赖的第三方库"></a>依赖的第三方库</h2><ul>
<li>requests</li>
<li>pandas</li>
<li>configparser</li>
<li>fake_useragent  </li>
</ul>
    
    </div>
    
    
    <div class="columns is-mobile is-multiline article-nav">
        <span class="column is-12-mobile is-half-desktop  article-nav-prev">
            
            <a href="/Item/%E4%BD%BF%E7%94%A8%E9%AB%98%E5%BE%B7Web%E6%9C%8D%E5%8A%A1API%E7%94%9F%E6%88%90shp%E6%96%87%E4%BB%B6/">使用高德Web服务API生成shp文件</a>
            
        </span>
        <span class="column is-12-mobile is-half-desktop  article-nav-next">
            
            <a href="/Study/Git%E5%B8%B8%E7%94%A8%E5%91%BD%E4%BB%A4/">Git常用命令</a>
            
        </span>
    </div>
    
</article>


<div class="sharebox">
    
<div class="sharethis-inline-share-buttons"></div>
<script type='text/javascript' src='//platform-api.sharethis.com/js/sharethis.js#property=608c1408daac690012507aa2&amp;product=sop' async='async'></script>

</div>



    </div>
</section>
    <footer class="footer">
    <div class="container">
        <div class="columns content">
            <div class="column is-narrow has-text-centered">
                &copy; 2021 wanzixin&nbsp;
                Powered by <a href="https://hexo.io/" target="_blank" rel="noopener">Hexo</a> &amp; <a
                        target="_blank" rel="noopener" href="https://github.com/ppoffice/hexo-theme-minos">Minos</a>
            </div>
            <div class="column is-hidden-mobile"></div>

            
            <div class="column is-narrow">
                <div class="columns is-mobile is-multiline is-centered">
                
                    
                <a class="column is-narrow has-text-black" title="GitHub" target="_blank" rel="noopener" href="https://github.com/ppoffice/hexo-theme-minos">
                    
                    GitHub
                    
                </a>
                
                </div>
            </div>
            
            
<div class="column is-narrow has-text-centered">
    <div class="dropdown is-up is-right is-hoverable" style="margin-top: -0.2em;">
        <div class="dropdown-trigger">
            <button class="button is-small" aria-haspopup="true" aria-controls="dropdown-menu7">
                <span class="icon">
                    <i class="fas fa-globe"></i>
                </span>
                <span>English</span>
                <span class="icon is-small">
            <i class="fas fa-angle-down" aria-hidden="true"></i>
          </span>
            </button>
        </div>
        <div class="dropdown-menu has-text-left" id="dropdown-menu7" role="menu">
            <div class="dropdown-content">
            
                <a href="/Item/%E6%96%B0%E6%B5%AA%E5%BE%AE%E5%8D%9A%E7%A7%BB%E5%8A%A8%E7%AB%AF%E7%AD%BE%E5%88%B0%E6%95%B0%E6%8D%AE%E7%88%AC%E5%8F%96/" class="dropdown-item">
                    English
                </a>
            
                <a href="/zh-cn/Item/%E6%96%B0%E6%B5%AA%E5%BE%AE%E5%8D%9A%E7%A7%BB%E5%8A%A8%E7%AB%AF%E7%AD%BE%E5%88%B0%E6%95%B0%E6%8D%AE%E7%88%AC%E5%8F%96/" class="dropdown-item">
                    简体中文
                </a>
            
            </div>
        </div>
    </div>
</div>

        </div>
    </div>
</footer>
    <script src="//cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script src="//cdnjs.cloudflare.com/ajax/libs/moment.js/2.22.2/moment-with-locales.min.js"></script>

<!-- test if the browser is outdated -->
<div id="outdated">
    <h6>Your browser is out-of-date!</h6>
    <p>Update your browser to view this website correctly. <a id="btnUpdateBrowser" target="_blank" rel="noopener" href="http://outdatedbrowser.com/">Update my browser now </a></p>
    <p class="last"><a href="#" id="btnCloseUpdateBrowser" title="Close">&times;</a></p>
</div>
<script src="//cdnjs.cloudflare.com/ajax/libs/outdated-browser/1.1.5/outdatedbrowser.min.js"></script>
<script>
    // Start the outdated-browser check once the DOM is ready; the plugin's
    // usage note asks for it to be called from inside the DOM-ready handler.
    $(function () {
        // Banner colours and the plugin's `lowerThan` threshold setting.
        var outdatedOptions = {
            bgColor: '#f25648',
            color: '#ffffff',
            lowerThan: 'flex'
        };

        outdatedBrowser(outdatedOptions);
    });
</script>

<script>
    // Global Font Awesome configuration object: enable the
    // searchPseudoElements option.
    window.FontAwesomeConfig = { searchPseudoElements: true };

    // Set the global moment.js locale for this page.
    moment.locale("en-AU");
</script>


    
    
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.4/MathJax.js?config=TeX-MML-AM_CHTML"></script>
<script>
    // Configure MathJax: turn off matchFontHeight in the HTML-CSS, SVG and
    // CommonHTML config sections, and accept both $...$ and \(...\) as
    // inline math delimiters.
    (function () {
        // Return a fresh settings object per section so none is shared.
        function withoutFontHeightMatching() {
            return { matchFontHeight: false };
        }

        var inlineDelimiters = [
            ['$', '$'],
            ['\\(', '\\)']
        ];

        MathJax.Hub.Config({
            "HTML-CSS": withoutFontHeightMatching(),
            SVG: withoutFontHeightMatching(),
            CommonHTML: withoutFontHeightMatching(),
            tex2jax: { inlineMath: inlineDelimiters }
        });
    })();
</script>

    
    
    
    
<script src="//cdnjs.cloudflare.com/ajax/libs/lightgallery/1.6.8/js/lightgallery-all.min.js"></script>
<script src="//cdnjs.cloudflare.com/ajax/libs/justifiedGallery/3.6.5/js/jquery.justifiedGallery.min.js"></script>
<script>
    // On DOM ready, initialise whichever gallery plugins are actually
    // registered on jQuery; each one is skipped when it is not available.
    jQuery(function ($) {
        var plugins = $.fn;

        // lightGallery: lightbox over .gallery-item elements within the article.
        if (typeof plugins.lightGallery === 'function') {
            $('.article.gallery').lightGallery({ selector: '.gallery-item' });
        }

        // justifiedGallery: applied to .justified-gallery containers.
        if (typeof plugins.justifiedGallery === 'function') {
            $('.justified-gallery').justifiedGallery();
        }
    });
</script>

    
    
    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.0/clipboard.min.js"></script>
    <style>
        /* Styles for the "Copy" button injected into highlighted code figures. */

        /* Positioning context for the absolutely placed button below. */
        .hljs {
            position: relative;
        }

        /* Base appearance: borderless grey text button floated to the right. */
        .hljs .clipboard-btn {
            background: none;
            border: none;
            color: #9a9a9a;
            cursor: pointer;
            float: right;
        }

        .hljs .clipboard-btn:hover {
            color: #8a8a8a;
        }

        /* Button placed directly inside the figure (no figcaption): pinned to
           the top-right corner and hidden until the figure is hovered. */
        .hljs > .clipboard-btn {
            display: none;
            position: absolute;
            top: 4px;
            right: 4px;
        }

        .hljs:hover > .clipboard-btn {
            display: inline;
        }

        /* Button placed inside a figcaption: always visible, small right gap. */
        .hljs > figcaption > .clipboard-btn {
            margin-right: 4px;
        }
    </style>
    <script>
      // Add a "Copy" button to every highlighted code figure and wire all of
      // the buttons to a single ClipboardJS instance.
      $(document).ready(function () {
        $('figure.hljs').each(function (i, figure) {
          var code = figure.querySelector('.code');

          // A figure without a .code element has nothing to copy. Skip it
          // instead of throwing on `code.id`, which would abort this handler
          // before ClipboardJS is set up for the remaining figures.
          if (!code) {
            return;
          }

          // Give the code element an id so the button can point at it.
          var codeId = 'code-' + i;
          code.id = codeId;

          var copyButton = $('<button>Copy <i class="far fa-clipboard"></i></button>')
            .addClass('clipboard-btn')
            .attr('data-clipboard-target-id', codeId);

          // Put the button in the figcaption when there is one, otherwise at
          // the start of the figure. jQuery's append/prepend are used instead
          // of the native ParentNode methods, which older browsers lack.
          var figcaption = figure.querySelector('figcaption');
          if (figcaption) {
            $(figcaption).append(copyButton);
          } else {
            $(figure).prepend(copyButton);
          }
        });

        // Resolve each button's copy target from the id stored on the button.
        var clipboard = new ClipboardJS('.clipboard-btn', {
          target: function (trigger) {
            return document.getElementById(trigger.getAttribute('data-clipboard-target-id'));
          }
        });

        // Clear the text selection ClipboardJS makes while copying.
        clipboard.on('success', function (e) {
          e.clearSelection();
        });
      });
    </script>

    
    

    



<script src="/js/script.js"></script>


    
    <div class="searchbox ins-search">
    <div class="searchbox-mask"></div>
    <div class="searchbox-container ins-search-container">
        <div class="searchbox-input-wrapper">
            <input type="text" class="searchbox-input ins-search-input" placeholder="Type something..." aria-label="Search" />
            <span class="searchbox-close ins-close ins-selectable"><i class="fa fa-times-circle"></i></span>
        </div>
        <div class="searchbox-result-wrapper ins-section-wrapper">
            <div class="ins-section-container"></div>
        </div>
    </div>
</div>
<script>
    // Publish the search widget's settings as the global INSIGHT_CONFIG:
    // the section labels and the URL of the content index.
    (function (global) {
        var sectionLabels = {
            POSTS: 'Posts',
            PAGES: 'Pages',
            CATEGORIES: 'Categories',
            TAGS: 'Tags',
            UNTITLED: '(Untitled)'
        };

        global.INSIGHT_CONFIG = {
            TRANSLATION: sectionLabels,
            CONTENT_URL: '/content.json'
        };
    })(window);
</script>

<script src="/js/insight.js"></script>

    
</body>
</html>